In this section, I will be loading the package and data required for the assignment.
library(ggplot2)
library(tidyverse)
library(gridExtra)
library(rvest)
library(stringr)
library(plotly)
library(DT)
setwd("~/")
noc_region=read_csv("noc_regions.csv")
gdp_pop=read_csv("gdp_pop.csv")
a_e=read_csv("athletes_and_events.csv")
noc_region <- noc_region %>%
mutate(NOC=replace(NOC, NOC == "URS"|NOC == "EUN", "RUS"),
NOC=replace(NOC, NOC == "SAA"|NOC == "FRG"|NOC == "GDR", "GER"),
NOC=replace(NOC, NOC == "SIN", "SGP"),
NOC=replace(NOC, NOC == "ANZ", "AUS"),
region=replace(region, NOC == "HKG", "Hong Kong")) %>%
unique()
a_e <- a_e %>%
mutate(NOC=replace(NOC, NOC == "URS"|NOC == "EUN", "RUS"),
NOC=replace(NOC, NOC == "SAA"|NOC == "FRG"|NOC == "GDR", "GER")) %>%
unique()
joined = full_join(noc_region, gdp_pop, by = c('NOC'='Code'))
joined <- joined %>%
select(NOC, region, Population, `GDP per Capita`) %>%
distinct()
colnames(joined) <- c("NOC", "Region", "Population", "GDPPerCapita")
olympics <- inner_join(x = a_e, y = joined, by = "NOC")
participation <- olympics[!is.na(olympics$Medal), ]
participationSummer <- participation[which(participation$Season=="Summer"),]
participationSummerAll <- olympics[which(olympics$Season=="Summer"),]
participationByCountry <- data.frame(table(unique(data.frame(participationSummerAll$NOC,participationSummerAll$Year))$participationSummerAll.NOC))
colnames(participationByCountry) <- c("NOC", "NumberOfGames")
NOCMedals <- participationSummer %>%
select(NOC, Year, Medal)
NOCMedals2 <- NOCMedals %>%
group_by(NOC,Medal) %>%
summarize(medals = length(Medal))
NOCMedals2 <- NOCMedals2 %>%
spread(Medal, medals)
gamesMedals <- left_join(x = participationByCountry, y = NOCMedals2, by = "NOC") %>%
arrange(desc(NumberOfGames)) %>%
head(10)
gamesMedals
## NOC NumberOfGames Bronze Gold Silver
## 1 FRA 29 587 463 567
## 2 GBR 29 620 635 729
## 3 GRE 29 84 62 109
## 4 ITA 29 454 518 474
## 5 SUI 29 139 99 178
## 6 AUT 28 53 29 88
## 7 DEN 28 177 179 236
## 8 SWE 28 358 354 396
## 9 USA 28 1197 2472 1333
## 10 AUS 27 510 342 452
b cont) I also created two visualizations, graph A showing the total medal count split by type of medal for the top 10 medal-winning NOCs, as well as graph B showing an over-time comparison of total medals won by the top 10 medal-winning NOCs, split by sex, from 1980 to 2016. I would recommend graph A to the reader as it provides a very informative view of the total number of medals won by the top 10 medal-winning NOCs, as well as the medal split by type. While graph B is informative an we can clearly see a trend of females winning more medals in more recent years, it does not show which NOCs are the major contributors to this phenomenon.
contingency <- data.frame(table(participation$NOC))
contingencySummer <- data.frame(table(participationSummer$NOC))
subsetContingency <- (head((contingencySummer[order(-contingencySummer$Freq),]), n = 10))[,1]
object <- data.frame(table(participationSummer$NOC, participationSummer$Medal))
colnames(object) <- c("NOC", "medal", "count")
temps <- filter(object, NOC %in% subsetContingency)
medalsByColor <-
ggplot(data = temps, aes(x=reorder(NOC, desc(count)), y=count, fill = medal)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = c("Gold" = "gold", "Silver" = "#a09a95", "Bronze" = "#dd842c")) +
xlab("NOC") +
ylab("Numbers of Medals") +
ggtitle("Top 10 Medal Winning NOCs, Summer Games")
medalsByColor
object2 <- data.frame(table(participationSummer$NOC, participationSummer$Year))
temps4 <- filter(object2, Var1 %in% subsetContingency, Var2 %in% c(1980:2016))
participationTop10 <- filter(participationSummer, NOC %in% subsetContingency)
medalsOverTime <- data.frame(table(participationTop10$Year, participationTop10$Sex))
medalsSummary <- medalsOverTime %>%
mutate(Var1 = as.numeric(as.character(Var1))) %>%
filter(Var1 > 1979)
colnames(medalsSummary) <- c("Year", "Sex", "Count")
medalGenderPlot <-
ggplot(data = medalsSummary, aes(x=Year, y=Count, group = Sex, color = Sex)) +
geom_line() +
geom_point() +
xlab("Year") +
ylab("Total Numbers of Medals") +
ggtitle("Total Medals for Top 10 Medal Winning NOCs, Summer Games") +
scale_x_continuous(breaks = seq(1980, 2016, by = 4))
medalGenderPlot
Here I created 3 graphs, based on aggregate (Gold, Silver, and Bronze) medals for the top 10 medal winning NOCs. Note that this is based on both Summer and Winter Games, whereas Q1 is only Summer. graph A shows the total number of medals won by the top 10 medal-winning NOCs graph B is adjusted by population, we can see that SWE and HUN really stood out because these are small countries with a small population, whereas USA came out at the bottom since it has a huge population compared with most of the other countries on the list graph C is adjusted by GDP, we can see that RUS leads in this graph as it is the only developing country on the list with relativey low GDP.
I did not use a multiplier/scaling factor for GDP or population because I only want to show the relative standing for each country.
joined2 = merge(contingency, joined, by.x ='Var1', by.y = 'NOC')
medalsPopGDP <- joined2 %>%
na.omit() %>%
mutate(populationM = Freq/Population,
GDPM = Freq/GDPPerCapita) %>%
arrange(desc(Freq)) %>%
head(10)
medalPlot <-
ggplot(data = medalsPopGDP, aes(x=reorder(Var1, desc(Freq)), y=Freq, fill = Var1)) +
geom_bar(stat = "identity") +
labs(title = "Top 10 Medal Winning NOCs",
x = "NOC", y = "Medals") +
theme(legend.position = "none")
medalPlot3 <-
ggplot(data = medalsPopGDP, aes(x=reorder(Var1, desc(Freq)), y=populationM, fill = Var1)) +
geom_bar(stat = "identity") +
labs(title = "Top 10 Medal Winning NOCs",
subtitle = "Adjusted by Population",
x = "NOC", y = "Medals") +
theme(legend.position = "none")
medalPlot4 <-
ggplot(data = medalsPopGDP, aes(x=reorder(Var1, desc(Freq)), y=GDPM, fill = Var1)) +
geom_bar(stat = "identity") +
labs(title = "Top 10 Medal Winning NOCs",
subtitle = "Adjusted by GDP",
x = "NOC", y = "Medals") +
theme(legend.position = "none")
grid.arrange(medalPlot, medalPlot3, medalPlot4, nrow = 3)
To perform the data cleaning, I manually changed the names of countries in order to match up with country names in the csv files.
Here I created a plot showing each NOC that has hosted the Olympics from from 1980 to 2016, the blue point shows the average number of medals won per Game when the NOC is hosting the Olympics, and the red point shows the average number of medals won per Game when the NOC is not hosting. I only included medal information from 1980 to 2016 since the number of medals won in the very distant past (e.g. 1896) is less relevant as a basic of comparison since both the number of participants and disciplines are very different from today. I also only included medals won during Summer Games to be consistent with the Wikipedia table.
It looks like there is a host country advantage considering every single NOC received more medals on average during Games when they were host than when they were not.
wiki_hosts <- read_html("https://en.wikipedia.org/wiki/Summer_Olympic_Games")
hosts <- html_table(html_nodes(wiki_hosts, "table")[[8]], fill=TRUE)
hosts <- hosts[-1,1:3]
hosts$city <- str_split_fixed(hosts$Host, n=2, ",")[,1]
hosts$country <- str_split_fixed(hosts$Host, n=2, ",")[,2]
hosts$country <-trimws(hosts$country,"l")
hosts <- hosts[-c(6,12,13),]
hosts <- hosts %>%
mutate(country=replace(country, country == "California, United States"|country == "California, United States[30]"|country == "United States", "USA"),
country=replace(country, country == "France[30]", "France"),
country=replace(country, country == "United Kingdom", "UK"),
country=replace(country, country == "Soviet Union", "Russia"),
country=replace(country, country == "West Germany", "Germany")) %>%
left_join(noc_region, by = c('country'='region'))
countryMedal <- data.frame(table(participationSummer$Year, participationSummer$NOC)) %>%
mutate(Var1 = as.numeric(as.character(Var1)), host = "No")
for (i in 1:nrow(hosts)){
countryMedal <- countryMedal %>%
mutate(host = replace(host, Var1 == hosts$Year[i] & Var2 == hosts$NOC[i], "Yes"))
}
colnames(countryMedal) <- c("Year", "NOC", "Count", "Host")
countryMedalSummary <- countryMedal %>%
filter(Year > 1979) %>%
group_by(NOC, Host) %>%
summarize(average = mean(Count)) %>%
ungroup() %>%
mutate(NOC = factor(NOC), Host = factor(Host)) %>%
filter(NOC %in% hosts$NOC[22:33])
hostPlot <-
ggplot(data = countryMedalSummary, mapping = aes(x=average, y = NOC, colour = Host)) +
geom_point(size = 3) +
xlab("Average Number of Medals") +
ylab("NOC") +
ggtitle("Host Country Advantage")
hostPlot
successPre <-
participation %>%
group_by(Name) %>%
mutate(totalMedals = length(Medal)) %>%
arrange(desc(totalMedals)) %>%
ungroup() %>%
select(Sport, NOC, Name, Sex, totalMedals) %>%
unique()
successA <- successPre %>%
arrange(desc(totalMedals)) %>%
head(10)
athletesPlot <-
ggplot(data = successA, aes(x=reorder(Name, totalMedals), y=totalMedals)) +
geom_bar(stat = "identity", aes(fill = Sex)) +
xlab("Name") +
ylab("Medals Won") +
ggtitle("Top 10 Most Successful Athletes") +
scale_y_continuous(breaks = seq(0, 30, by = 2)) +
coord_flip()
athletesPlot
badminton <- successPre %>%
filter(Sport == "Badminton") %>%
arrange(desc(totalMedals)) %>%
head(10)
badmintonPlot <-
ggplot(data = badminton, aes(x=reorder(Name, totalMedals), y=totalMedals, color = NOC)) +
geom_bar(stat = "identity", aes(fill = NOC)) +
xlab("Name") +
ylab("Medals Won") +
ggtitle("Successful Athletes in Badminton by NOC based on Total Medals") +
scale_y_continuous(breaks = seq(0, 8, by = 1)) +
coord_flip()
badmintonPlot
Here I made two interactive plots, graph A is the host country advantage, graph B is the number of medals won by type for the top 10 medal-winning NOCs.
I chose to show the host country advantage plot so the user can see the exact number of average medals won (since the scale is in increments of 100). The total medals won by medal type plot is chosen so the user can hover over specific medal colors and see more details.
ggplotly(hostPlot)
ggplotly(medalsByColor)
Here I made a data table of the top 100 most successful athletes, ranked by number of medals won. You can search for the athlete, or sort by sport, NOC, name, sex and total medals won. I chose this data table because it’s simple to understand and provides a lot of useful information.
datatable(head(successPre, 100), options = list(
order = list(5,'desc')
))